In [ ]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#set background color grey
sns.set_theme(style="darkgrid")
In [ ]:
# Load per-turn features and keep only the person's turns (drop the robot's rows).
SAMPLE_PERIOD_S = 0.2  # seconds per sample index, per the turn_duration formula below

df = pd.read_csv("all_turns_2.csv")
df = df[df['person_robot'] == 'person']
# 'Unnamed: 0' is the CSV's stale index column; drop without inplace mutation
# so re-running the cell on a fresh read stays idempotent.
df = df.drop(columns=['Unnamed: 0'])
# Turn duration in seconds, derived from the start/end sample indices.
df['turn_duration'] = SAMPLE_PERIOD_S * (df['end_idx'].astype('float') - df['start_idx'].astype('float'))
df.describe().T
Out[ ]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| participant_id | 103.0 | 1825.067961 | 633.333838 | 407.000000 | 2101.500000 | 2104.000000 | 2107.000000 | 2111.000000 |
| path_num | 103.0 | 2.233010 | 0.730349 | 1.000000 | 2.000000 | 2.000000 | 3.000000 | 3.000000 |
| turn_num | 103.0 | 2.640777 | 1.708424 | 1.000000 | 1.000000 | 2.000000 | 4.000000 | 9.000000 |
| start_idx | 103.0 | 551.747573 | 438.237596 | 50.000000 | 255.000000 | 456.000000 | 674.000000 | 2199.000000 |
| end_idx | 103.0 | 594.553398 | 437.257983 | 86.000000 | 298.000000 | 482.000000 | 713.500000 | 2225.000000 |
| walking_direction_lag | 103.0 | -3.155340 | 25.493081 | -151.000000 | -10.000000 | -1.000000 | 4.000000 | 107.000000 |
| walking_direction_base_corr | 103.0 | 0.085065 | 0.434108 | -0.857578 | -0.315911 | 0.098156 | 0.443063 | 0.941918 |
| walking_direction_lagged_corr | 103.0 | 0.468114 | 0.174327 | 0.122957 | 0.342861 | 0.439514 | 0.584931 | 0.948226 |
| walking_direction_dtw | 103.0 | 37.905734 | 24.331792 | 4.239983 | 22.831765 | 31.335852 | 44.304690 | 151.617539 |
| speeds_lag | 103.0 | -0.485437 | 13.936033 | -48.000000 | -7.000000 | -1.000000 | 3.000000 | 62.000000 |
| speeds_base_corr | 103.0 | 0.219284 | 0.364609 | -0.797566 | -0.053144 | 0.222804 | 0.507383 | 0.881916 |
| speeds_lagged_corr | 103.0 | 0.514786 | 0.165729 | 0.171078 | 0.380823 | 0.485911 | 0.670622 | 0.881916 |
| speeds_dtw | 103.0 | 29.832122 | 15.988083 | 10.358754 | 21.324705 | 24.668994 | 34.570697 | 103.965381 |
| mean_distance | 103.0 | 2.384638 | 1.791928 | 0.433665 | 1.362685 | 2.025793 | 2.836760 | 13.639054 |
| mean_speed_difference | 103.0 | 0.336295 | 0.145443 | 0.094192 | 0.237638 | 0.298077 | 0.398279 | 0.922073 |
| mean_walking_direction_difference | 103.0 | 63.593606 | 20.966348 | 14.479058 | 48.625379 | 66.074569 | 80.126460 | 100.850062 |
| mean_pace_asymmetry | 103.0 | 0.430152 | 0.139142 | 0.110604 | 0.347419 | 0.421565 | 0.504862 | 0.876306 |
| turn_duration | 103.0 | 8.561165 | 5.561919 | 5.000000 | 5.400000 | 6.400000 | 9.400000 | 33.600000 |
In [ ]:
# Distribution of turn durations (seconds): check spread and outliers.
fig, ax = plt.subplots()
sns.boxplot(x=df['turn_duration'], ax=ax)
ax.set_title('Box plot of turn duration')
plt.show()
In [ ]:
# Normalize the DTW distances by the number of samples in the turn so that
# longer turns don't dominate: turn_duration / 0.2 recovers the sample count,
# since turn_duration was computed as 0.2 * (end_idx - start_idx) above.
df['normalized_walking_direction_dtw'] = df['walking_direction_dtw'] / (df['turn_duration'] / 0.2)
df['normalized_speeds_dtw'] = df['speeds_dtw'] / (df['turn_duration'] / 0.2)
In [ ]:
# Lag magnitudes, ignoring the sign (i.e. ignoring who leads vs. who follows).
df['abs_walking_direction_lag'] = np.abs(df['walking_direction_lag'])
df['abs_speeds_lag'] = np.abs(df['speeds_lag'])
In [ ]:
# Features carried into the correlation analyses below. Commented-out entries
# were considered but excluded (presumably redundant with their lagged/
# normalized counterparts — kept here for easy re-enabling).
relevant_features = [
'turn_duration',
'mean_distance',
'mean_pace_asymmetry',
'walking_direction_lag',
'abs_walking_direction_lag',
'walking_direction_dtw',
'normalized_walking_direction_dtw',
# 'walking_direction_base_corr',
'walking_direction_lagged_corr',
# 'mean_walking_direction_difference',
'speeds_lag',
'abs_speeds_lag',
'speeds_dtw',
'normalized_speeds_dtw',
# 'speeds_base_corr',
'speeds_lagged_corr',
# 'mean_speed_difference',
]
In [ ]:
def plot_corr_heatmap(data, title, threshold=0.3):
    """Plot a Pearson correlation heatmap of `relevant_features` for `data`,
    masking weak (|r| < threshold) cells, and return the correlation matrix."""
    corr_matrix = data[relevant_features].corr(method='pearson', numeric_only=True)
    plt.figure(figsize=(12, 10))
    # Hide weak correlations so the strong relationships stand out.
    sns.heatmap(corr_matrix, annot=True, fmt=".2f", mask=np.abs(corr_matrix) < threshold)
    plt.title(title)
    plt.show()
    return corr_matrix

corr = plot_corr_heatmap(df, f"Metrics Correlation Matrix - Original Data (n={len(df)})")
In [ ]:
# Flag turns that overlap in time with another turn of the same participant/path.
# Two index ranges [s1, e1] and [s2, e2] overlap iff s1 <= e2 and s2 <= e1.
# The previous start-in-range/end-in-range test missed full containment:
# a turn entirely inside another flagged only the outer turn, not the inner one.
df['overlapping'] = False
for index, row in df.iterrows():
    same_path = df[(df['participant_id'] == row['participant_id']) & (df['path_num'] == row['path_num'])]
    overlaps = (same_path['start_idx'] <= row['end_idx']) & (same_path['end_idx'] >= row['start_idx'])
    # The row always overlaps itself, hence the > 1.
    if overlaps.sum() > 1:
        df.at[index, 'overlapping'] = True
# NOTE(review): the overlap filter is currently disabled — the full df is used.
# overlapping_and_not_subset = df[df['overlapping'] == True]
overlapping_and_not_subset = df
oans = overlapping_and_not_subset
corr_oans = oans[relevant_features].corr(method='pearson', numeric_only=True)
mask = np.abs(corr_oans) < 0.3
plt.figure(figsize=(12, 10))
sns.heatmap(corr_oans, annot=True, fmt=".2f", mask=mask)
plt.title(f"Metrics Correlation Matrix - Overlapping Data (n={len(oans)})")
plt.show()
In [ ]:
# Keep only turns with at least a weak positive lagged correlation in both
# walking direction and speed, then re-plot the masked correlation matrix.
threshold = 0.3
keep = (df['walking_direction_lagged_corr'] > threshold) & (df['speeds_lagged_corr'] > threshold)
filtered_df = df[keep]
# filtered_oans = filtered_df[filtered_df['overlapping'] == True]
filtered_oans = filtered_df
corr_filtered_oans = filtered_oans[relevant_features].corr(method='pearson', numeric_only=True)
mask = np.abs(corr_filtered_oans) < 0.3
plt.figure(figsize=(12, 10))
sns.heatmap(corr_filtered_oans, annot=True, fmt=".2f", mask=mask)
plt.title(f"Metrics Correlation Matrix - Filtered Overlapping Data (n={len(filtered_oans)})")
plt.show()
In [ ]:
# Summary statistics of the filtered data (compare against the unfiltered
# describe() output near the top of the notebook).
filtered_oans.describe().T
Out[ ]:
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| participant_id | 80.0 | 1787.050000 | 666.917107 | 407.000000 | 2101.000000 | 2103.500000 | 2107.000000 | 2111.000000 |
| path_num | 80.0 | 2.237500 | 0.697985 | 1.000000 | 2.000000 | 2.000000 | 3.000000 | 3.000000 |
| turn_num | 80.0 | 2.612500 | 1.789518 | 1.000000 | 1.000000 | 2.000000 | 4.000000 | 9.000000 |
| start_idx | 80.0 | 557.275000 | 454.371648 | 54.000000 | 256.000000 | 456.000000 | 668.000000 | 2199.000000 |
| end_idx | 80.0 | 595.600000 | 453.169670 | 108.000000 | 300.000000 | 482.500000 | 698.250000 | 2225.000000 |
| walking_direction_lag | 80.0 | -1.662500 | 12.969237 | -41.000000 | -8.250000 | -1.000000 | 4.000000 | 47.000000 |
| walking_direction_base_corr | 80.0 | 0.115905 | 0.467918 | -0.857578 | -0.328202 | 0.212734 | 0.511793 | 0.941918 |
| walking_direction_lagged_corr | 80.0 | 0.517046 | 0.159682 | 0.315465 | 0.377550 | 0.467425 | 0.615642 | 0.948226 |
| walking_direction_dtw | 80.0 | 33.305556 | 18.261076 | 4.239983 | 21.018501 | 27.573042 | 42.750365 | 107.642701 |
| speeds_lag | 80.0 | -2.175000 | 9.865905 | -30.000000 | -7.000000 | -1.000000 | 0.000000 | 23.000000 |
| speeds_base_corr | 80.0 | 0.245733 | 0.373104 | -0.797566 | -0.031807 | 0.288961 | 0.542515 | 0.881916 |
| speeds_lagged_corr | 80.0 | 0.538861 | 0.148699 | 0.309378 | 0.428816 | 0.512229 | 0.679896 | 0.881916 |
| speeds_dtw | 80.0 | 26.937057 | 12.986413 | 10.358754 | 19.770054 | 23.836152 | 31.466053 | 90.532321 |
| mean_distance | 80.0 | 2.223850 | 1.635995 | 0.433665 | 1.364710 | 1.950531 | 2.689676 | 13.639054 |
| mean_speed_difference | 80.0 | 0.334028 | 0.133022 | 0.094192 | 0.246300 | 0.296413 | 0.405460 | 0.922073 |
| mean_walking_direction_difference | 80.0 | 62.887446 | 21.507993 | 14.479058 | 47.266777 | 64.734566 | 78.912842 | 100.850062 |
| mean_pace_asymmetry | 80.0 | 0.432057 | 0.128814 | 0.122233 | 0.353634 | 0.420388 | 0.509542 | 0.870018 |
| turn_duration | 80.0 | 7.665000 | 4.173063 | 5.000000 | 5.200000 | 5.800000 | 8.200000 | 33.400000 |
| normalized_walking_direction_dtw | 80.0 | 0.908792 | 0.384533 | 0.146206 | 0.632673 | 0.802530 | 1.103255 | 2.031908 |
| normalized_speeds_dtw | 80.0 | 0.734038 | 0.216814 | 0.256842 | 0.587907 | 0.732877 | 0.855633 | 1.422129 |
| abs_walking_direction_lag | 80.0 | 8.937500 | 9.492627 | 0.000000 | 1.750000 | 6.000000 | 13.000000 | 47.000000 |
| abs_speeds_lag | 80.0 | 6.925000 | 7.318738 | 0.000000 | 1.000000 | 5.000000 | 10.000000 | 30.000000 |
In [ ]:
from scipy.stats import pearsonr

# For each feature, scatter-plot it against every other feature it is strongly
# (|r| > 0.3) correlated with. Pairs where one name is a prefix/suffix of the
# other (e.g. a feature vs. its abs_/normalized_ variant) are skipped as
# trivially related.
for feature in relevant_features:
    to_display = [
        feature2 for feature2 in relevant_features
        if not feature.startswith(feature2) and not feature2.startswith(feature)
        and not feature.endswith(feature2) and not feature2.endswith(feature)
        and np.abs(corr_filtered_oans.loc[feature, feature2]) > 0.3
    ]
    if not to_display:
        continue
    # Grid of up to 3 plots per row.
    n_rows = int(np.ceil(len(to_display) / 3))
    n_cols = min(len(to_display), 3)
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows))
    axs = np.atleast_2d(axs)  # uniform 2-D indexing even for a single row/plot
    for i, feature2 in enumerate(to_display):
        # BUG FIX: the row index was ceil(i/3)-1, which sends i=0 to the last
        # row (via -1) and shifts the groups; i // 3 is the correct row.
        ax = axs[i // 3, i % 3]
        result = pearsonr(filtered_oans[feature], filtered_oans[feature2], alternative='two-sided')
        p_val = result.pvalue
        ci = result.confidence_interval(confidence_level=0.95)
        # Scatter plot plus a regression line on the same axes.
        sns.scatterplot(x=feature, y=feature2, data=filtered_oans, ax=ax)
        sns.regplot(x=feature, y=feature2, data=filtered_oans, scatter=False, line_kws={'color': 'red'}, ax=ax)
        ax.set_title(
            f"compared with {feature2}\ncorr: {round(corr_filtered_oans.loc[feature, feature2], 3)}, "
            f"p_val: {round(p_val, 5)}, CI: {[round(c, 3) for c in ci]}",
            fontweight='bold',
        )
    # Hide any unused trailing axes in the grid.
    for j in range(len(to_display), n_rows * n_cols):
        axs[j // 3, j % 3].axis('off')
    fig.suptitle(f"{feature}'s correlations", fontweight='bold')
    plt.tight_layout()
    plt.show()
In [ ]:
from PIL import Image

def _show_turn(res, feature, label):
    """Display the four diagnostic images (paths, distance, walking directions,
    speeds) for the single turn in `res`, titled with its identifying fields.

    res: one-row DataFrame with participant_id/person_robot/path_num/turn_num
         plus the feature column; label: "highest" or "lowest".
    """
    base_path = (f"./turns/{res['participant_id'].values[0]}/{res['person_robot'].values[0]}/"
                 f"run_{res['path_num'].values[0]}/turn_{res['turn_num'].values[0]}/")
    fig, axs = plt.subplots(1, 4, figsize=(20, 5))
    for ax, img_name in zip(axs, ("paths.png", "distance.png", "walking_directions.png", "speeds.png")):
        ax.imshow(Image.open(base_path + img_name))
        ax.axis('off')
    # res.to_dict() is {column: {row_index: value}}; flatten the single row,
    # rounding floats for a compact title.
    info = {k: (round(v, 3) if isinstance(v, float) else v)
            for k, inner in res.to_dict().items() for v in inner.values()}
    info_str = ", ".join(f"{k}: {v}" for k, v in info.items())
    fig.suptitle(f"{feature} - {label} value\n {info_str}", fontweight='bold')
    plt.tight_layout()
    plt.show()

# For each feature, show the diagnostic images of the turns with the highest
# and lowest value (previously two near-identical ~20-line copy-pasted blocks).
id_cols = ['participant_id', 'person_robot', 'path_num', 'turn_num']
for feature in relevant_features:
    highest = filtered_oans.loc[filtered_oans[feature].nlargest(1).index, id_cols + [feature]]
    _show_turn(highest, feature, "highest")
    lowest = filtered_oans.loc[filtered_oans[feature].nsmallest(1).index, id_cols + [feature]]
    _show_turn(lowest, feature, "lowest")
    print("\n\n")
In [ ]:
# Distribution of each relevant feature (histogram + KDE), 3 plots per row.
n_rows = int(np.ceil(len(relevant_features) / 3))
n_cols = min(len(relevant_features), 3)
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
axs = np.atleast_2d(axs)  # uniform 2-D indexing even for a single row
for i, feature in enumerate(relevant_features):
    # BUG FIX: row index is i // 3; the previous ceil(i/3)-1 put i=0 on the
    # last row and shifted the rest of the layout.
    ax = axs[i // 3, i % 3]
    sns.histplot(data=filtered_oans, x=feature, kde=True, ax=ax)
    ax.set_title(feature)
    ax.set_xlabel('')
    ax.set_ylabel('')
# Hide any unused trailing axes in the grid.
for j in range(len(relevant_features), n_rows * n_cols):
    axs[j // 3, j % 3].axis('off')
plt.tight_layout()
plt.show()
In [ ]:
import scipy.stats as stats

# Q-Q plots against the normal distribution to assess normality per feature.
n_rows = int(np.ceil(len(relevant_features) / 3))
n_cols = min(len(relevant_features), 3)
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
axs = np.atleast_2d(axs)  # uniform 2-D indexing even for a single row
for i, feature in enumerate(relevant_features):
    # BUG FIX: row index is i // 3; the previous ceil(i/3)-1 put i=0 on the
    # last row and shifted the rest of the layout.
    ax = axs[i // 3, i % 3]
    stats.probplot(filtered_oans[feature], dist="norm", plot=ax)
    ax.set_title(feature)
    ax.set_xlabel('Theoretical Quantiles')
    ax.set_ylabel('Ordered Values')
# Hide any unused trailing axes in the grid.
for j in range(len(relevant_features), n_rows * n_cols):
    axs[j // 3, j % 3].axis('off')
plt.tight_layout()
plt.show()
In [ ]:
# Box plots per feature to inspect outliers, 3 plots per row.
n_rows = int(np.ceil(len(relevant_features) / 3))
n_cols = min(len(relevant_features), 3)
fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows))
axs = np.atleast_2d(axs)  # uniform 2-D indexing even for a single row
for i, feature in enumerate(relevant_features):
    # BUG FIX: row index is i // 3; the previous ceil(i/3)-1 put i=0 on the
    # last row and shifted the rest of the layout.
    ax = axs[i // 3, i % 3]
    sns.boxplot(data=filtered_oans, y=feature, ax=ax)
    ax.set_title(feature)
    ax.set_xlabel('')
    ax.set_ylabel('')
# Hide any unused trailing axes in the grid.
for j in range(len(relevant_features), n_rows * n_cols):
    axs[j // 3, j % 3].axis('off')
plt.tight_layout()
plt.show()